library(tidyverse)

# Columns that the wrangling step below converts to factors
factorCols <- c(
  "author_id",
  "author_name",
  "author_username",
  "lang",
  "conversation_id",
  "query"
)

# Load the Rds file scraped in the Jupyter notebooks (Python) and clean it:
# drop every tweet whose text mentions "wenda_phayy" (a different "wenda"
# from the one we are interested in). Rows whose text is NA are dropped as
# well, because filter() only keeps rows where the condition is TRUE.
data <- readRDS("/home/ubuntu/data/social-media-disinformation/tweets/20210707_scrape.Rds") %>%
  filter(str_detect(text, "wenda_phayy", negate = TRUE))

# Wrangle the scraped tweets down to one row per tweet id:
#   1. convert the columns named in `factorCols` to factors;
#   2. derive tweet-type flags, parsed timestamps and a tweet URL;
#   3. for each tweet id, record which search queries returned it
#      (one logical column per query);
#   4. drop the raw `query` / `tweet_type` columns and keep a single row
#      per id.
data <- data %>%
  mutate(across(all_of(factorCols), factor)) %>%
  mutate(
    # Tweet-type flags derived from the textual `tweet_type` column
    is_reply = str_detect(tweet_type, "replied_to"),
    is_retweet = str_detect(tweet_type, "retweeted"),
    is_quote = str_detect(tweet_type, "quoted"),
    # A literal "[]" in tweet_type is treated as an original tweet
    is_original = tweet_type == "[]",
    # The timestamp strings end in a literal "Z" (ISO 8601 UTC designator),
    # so parse them explicitly as UTC; without `tz` they would be read as
    # local time of whatever machine runs this script.
    tweet_created_at = as.POSIXct(
      tweet_created_at,
      format = "%Y-%m-%dT%H:%M:%OSZ",
      tz = "UTC"
    ),
    author_created = as.POSIXct(
      author_created,
      format = "%Y-%m-%dT%H:%M:%OSZ",
      tz = "UTC"
    ),
    # Use the columns of the piped data frame rather than `data$...`, which
    # would read from the global object instead of the rows being mutated.
    tweet_url = paste0(
      "https://twitter.com/", author_username, "/status/", id
    )
  ) %>%
  # A tweet appears once per query that returned it; flag, per tweet id,
  # every query it was found under.
  group_by(id, .drop = FALSE) %>%
  mutate(
    papuanlivesmatter = any(query == '"#papuanlivesmatter"'),
    koman = any(query == '"beasiswa veronica koman"'),
    bin_nugraha = any(query == '"Papua BIN" OR "Papua Nugraha"'),
    nkri = any(query == '"papua NKRI"'),
    teroris_kkb = any(query == '"papua teroris" OR "papua KKB"'),
    UNassembly = any(query == '"sidang umum pbb papua" OR "unga papua" OR "vanuatu papua" OR "#sidangpbb"'),
    ulmwp_wenda = any(query == '"ULMWP" OR "wenda"'),
    FaktadiPapua = any(query == "#FaktadiPapua"),
    nduga = any(query == "nduga"),
    otsus = any(query == "otsus"),
    rasisme = any(query == "rasisme"),
    zanambani = any(query == "zanambani")
  ) %>%
  ungroup() %>%
  select(-query, -tweet_type) %>%
  # Every row of an id now carries the same query flags, so keeping only the
  # first row per id loses no query-membership information.
  distinct(id, .keep_all = TRUE)

# Persist the wrangled data frame as an RDS file whose name is prefixed with
# today's date (YYYYMMDD); saveRDS() compresses its output by default.
saveRDS(
  object = data,
  file = paste0(
    "/home/master/shared_folder/Data/",
    format(Sys.Date(), "%Y%m%d"),
    "_wrangled.rds"
  )
)
